library(rvest)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(purrr)
library(rlang)
##
## Attaching package: 'rlang'
## The following objects are masked from 'package:purrr':
##
## %@%, flatten, flatten_chr, flatten_dbl, flatten_int, flatten_lgl,
## flatten_raw, invoke, splice
library(ggplot2)
library(tidyr)
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
#----------------- Webscraping --------------------
download_ntia_datasets <- function(url = "https://www.ntia.gov/page/download-ntia-internet-use-survey-datasets",
file_types = c("csv.zip"),
download_dir = ".") {
page <- read_html(url)
links <- html_attr(html_nodes(page, "a"), "href")
# Filter for dataset file types
dataset_links <- links[grepl(paste(file_types, collapse = "|"), links, ignore.case = TRUE)]
# Define allowed two-digit odd years between 01 and 23
allowed_years <- sprintf("%02d", seq(1, 23, by = 2)) # "01", "03", ..., "23"
# Further filter: Only keep datasets from allowed years
filtered_links <- dataset_links[sapply(dataset_links, function(link) {
two_digit_years <- regmatches(link, gregexpr("\\d{2}", link))[[1]]
any(two_digit_years %in% allowed_years)
})]
# Make full URLs and download
for (link in filtered_links) {
full_link <- ifelse(grepl("^http", link), link, paste0("https://www.ntia.gov", link))
file_name <- basename(full_link)
dest_path <- file.path(download_dir, file_name)
message("Downloading (years 01-23, odd only): ", file_name)
tryCatch({
download.file(full_link, destfile = dest_path, mode = "wb")
}, error = function(e) {
warning("Failed to download: ", file_name)
})
}
}
download_ntia_datasets() # Now downloads only odd years from 01 to 23
## Downloading (years 01-23, odd only): nov23-cps-csv.zip
## Downloading (years 01-23, odd only): nov21-cps-csv.zip
## Downloading (years 01-23, odd only): nov19-cps-csv.zip
## Downloading (years 01-23, odd only): nov17-cps-csv.zip
## Downloading (years 01-23, odd only): jul15-cps-csv.zip
## Downloading (years 01-23, odd only): jul13-cps-csv.zip
## Downloading (years 01-23, odd only): jul11-cps-csv.zip
## Downloading (years 01-23, odd only): oct09-cps-csv.zip
## Downloading (years 01-23, odd only): oct07-cps-csv.zip
## Downloading (years 01-23, odd only): oct03-cps-csv.zip
## Downloading (years 01-23, odd only): sep01-cps-csv.zip
#----------------- Unzipping files -----------------------
unzip_ntia_files <- function(zip_dir = ".", unzip_dir = tempfile()) {
zip_files <- list.files(zip_dir, pattern = "\\.zip$", full.names = TRUE)
if (length(zip_files) == 0) {
stop("No zip files found in the specified directory.")
}
if (!dir.exists(unzip_dir)) {
dir.create(unzip_dir)
}
for (zip_path in zip_files) {
message("Unzipping: ", basename(zip_path))
unzip(zip_path, exdir = unzip_dir)
}
return(unzip_dir) # Return the path where files were unzipped
}
unzipped_folder <- unzip_ntia_files(zip_dir = ".", unzip_dir = "unzipped_ntia_files")
## Unzipping: jul11-cps-csv.zip
## Unzipping: jul13-cps-csv.zip
## Unzipping: jul15-cps-csv.zip
## Unzipping: nov17-cps-csv.zip
## Unzipping: nov19-cps-csv.zip
## Unzipping: nov21-cps-csv.zip
## Unzipping: nov23-cps-csv.zip
## Unzipping: oct03-cps-csv.zip
## Unzipping: oct07-cps-csv.zip
## Unzipping: oct09-cps-csv.zip
## Unzipping: sep01-cps-csv.zip
#---------------- reading csv -----------------------------
read_ntia_csvs <- function(csv_dir) {
csv_files <- list.files(csv_dir, pattern = "\\.csv$", full.names = TRUE)
if (length(csv_files) == 0) {
stop("No CSV files found in the specified directory.")
}
all_data <- list()
for (csv_path in csv_files) {
message("Reading: ", basename(csv_path))
data_name <- tools::file_path_sans_ext(basename(csv_path))
all_data[[data_name]] <- read.csv(csv_path)
}
return(all_data) # Saving a named list
}
ntia_datasets <- read_ntia_csvs(csv_dir = unzipped_folder)
## Reading: jul11-cps.csv
## Reading: jul13-cps.csv
## Reading: jul15-cps.csv
## Reading: nov17-cps.csv
## Reading: nov19-cps.csv
## Reading: nov21-cps.csv
## Reading: nov23-cps.csv
## Reading: oct03-cps.csv
## Reading: oct07-cps.csv
## Reading: oct09-cps.csv
## Reading: sep01-cps.csv
#---------------------- cleaning dataset-----------------------
clean_ntia_data <- function(dataset_list, na_value = -1) {
cleaned_list <- lapply(dataset_list, function(df) {
df[is.na(df)] <- na_value
return(df)
})
return(cleaned_list)
}
ntia_datasets <- clean_ntia_data(ntia_datasets)
#----------------- Selecting covariates requied for Analysis ---------------
select_covars <- function(datasets, columns_to_select_list) {
selected_data <- list()
for (dataset_name in names(datasets)) {
df <- datasets[[dataset_name]]
if (dataset_name %in% names(columns_to_select_list)) {
cols_wanted <- columns_to_select_list[[dataset_name]]
available_cols <- intersect(cols_wanted, names(df))
if (length(available_cols) == 0) {
warning("No matching columns found for ", dataset_name)
selected_data[[dataset_name]] <- df # return full df if no matching columns
} else {
selected_data[[dataset_name]] <- dplyr::select(df, all_of(available_cols))
}
} else {
# If no selection provided for this dataset, keep full dataset
selected_data[[dataset_name]] <- df
}
}
return(selected_data)
}
# Read the file as a string
if (file.exists("columns_to_select_list.txt")) {
list_code <- readLines("columns_to_select_list.txt")
list_code <- paste(list_code, collapse = "\n")
eval(parse(text = list_code))
message("Loaded: columns_to_select_list")
} else {
columns_to_select_list <- list(
"jul11-cps" = c("hryear4","peeduca","hufaminc","peage","pesex","hesci6", "hesci5","hesci3"),
"jul13-cps" = c("hryear4","peeduca","hesint1","henet3","henet2","henet3a","pedesk","pelapt","petabl",
"pecell","pegame","petvba","pehome", "pecafe", "peelhs", "pecomm",
"pelibr", "pewrka", "peschl","peperscr","hesci1","hesci11","henet3","henet2","peprim1","pedesk","pelapt","petabl","pecell","pegame","petvba",
"pehome", "pecafe", "peelhs", "pecomm","pelibr", "pewrka", "peschl","hesci1"),
"jul15-cps" = c("hryear4","peeduca","hefaminc","prtage","pesex","heinhome","heinschl","heincafe","heintrav","heinlico","heinelho", "heinwork",
"helaptop", "hedesktp", "hetablet", "hemphone", "hewearab", "hetvbox"),
"nov17-cps" = c("hryear4","peeduca","hefaminc","prtage","pesex","heinhome","heinwork", "heinschl", "heincafe", "heintrav", "heinlico",
"heinelho", "heinothr","helaptop", "hedesktp", "hetablet", "hemphone", "hewearab", "hetvbox"),
"nov19-cps" = c("hryear4","peeduca","hefaminc","prtage","pesex","heinhome","heinwork", "heinschl", "heincafe", "heintrav", "heinlico", "heinelho",
"heinothr","helaptop", "hedesktp", "hetablet", "hemphone", "hewearab", "hetvbox",
"peemail", "petextim", "pesocial", "peconfer", "pevideo", "peaudio", "petelewk"),
"nov21-cps" = c("hryear4","peeduca","hefaminc","prtage","pesex","hedesktp","helaptop","hetablet","hemphone","hewearab","hetvbox",
"heinhome", "heinwork", "heinschl", "heincafe", "heinlico", "heintrav", "heinelho", "heinothr"),
"nov23-cps" = c("hryear4","peeduca","hefaminc","prtage","pesex","heinhome","heinwork", "heinschl", "heincafe", "heintrav", "heinlico", "heinelho",
"heinothr","henetchk","helaptop", "hedesktp", "hetablet", "hemphone", "hewearab", "hetvbox"),
"oct03-cps" = c("hryear4","peeduca","hufaminc","prtage","pesex","hesint1","hesc1"),
"oct07-cps" = c("hryear4","peeduca","hufaminc","peage","pesex","henet1"),
"oct09-cps" = c("hryear4","peeduca","hufaminc","peage","pesex","henet1"),
"sep01-cps" = c("hryear4","peeduca","hufaminc","prtage","pesex","hesint1","hesc1")
)
}
## Loaded: columns_to_select_list
# Select specific covariates
ntia_selected <- select_covars(ntia_datasets, columns_to_select_list)
ntia_selected <- ntia_selected[order(as.numeric(gsub("\\D", "", names(ntia_selected))))]
#----------------- getting population size for every year ------------
count_ntia_rows <- function(dataset_list) {
row_counts <- sapply(dataset_list, nrow)
print(row_counts)
return(row_counts)
}
row_counts <- count_ntia_rows(ntia_selected)
## sep01-cps oct03-cps oct07-cps oct09-cps jul11-cps jul13-cps jul15-cps nov17-cps
## 158865 156941 151370 152246 152260 150308 149416 147801
## nov19-cps nov21-cps nov23-cps
## 138850 127375 126917
#------------------ Merging all the data -----------------------------
standardize_columns <- function(df, all_columns, filler = -1) {
missing_cols <- setdiff(all_columns, names(df))
for (col in missing_cols) {
df[[col]] <- filler
}
# Reorder to match all_columns
df <- df[, all_columns]
return(df)
}
all_columns <- unique(unlist(lapply(ntia_selected, names)))
standardized_list <- lapply(ntia_selected, standardize_columns, all_columns = all_columns)
merged_ntia <- do.call(rbind, standardized_list)
#----------------- Merging columns that are the same -----------------------
merge_columns <- function(col1, col2, na_value = -1) {
merged <- ifelse(col1 != na_value, col1,
ifelse(col2 != na_value, col2, na_value))
return(merged)
}
merged_ntia$peage <- merge_columns(merged_ntia$peage, merged_ntia$prtage)
merged_ntia$hufaminc <- merge_columns(merged_ntia$hufaminc, merged_ntia$hefaminc)
#================= Categorizing for the ease in Interaction with teammates Data ==================
#---------------- Categorizing for the analysis by Income -------------------------
collapse_income_brackets <- function(df, income_var , new_var) {
df[[new_var]] <- with(df, ifelse(df[[income_var]] %in% 1:6, 1,
ifelse(df[[income_var]] %in% 7:11, 2,
ifelse(df[[income_var]] %in% 12:13, 3,
ifelse(df[[income_var]] %in% 14:16, 4, -1)))))
return(df)
}
merged_ntia <- collapse_income_brackets(merged_ntia,"hufaminc","hefaminc")
#---------------- Categorizing for the analysis by Age -------------------------
categorize_age_base <- function(age) {
ifelse(age >= 18 & age <= 20, 4,
ifelse(age >= 21 & age <= 25, 5,
ifelse(age >= 26 & age <= 34, 6,
ifelse(age >= 35, 7,
-1))))
}
merged_ntia$prtage <- categorize_age_base(merged_ntia$peage)
#---------------- Categorizing for the analysis by Educational Qualifications -------------------------
categorize_education <- function(df, educ_var, age_var, new_var = "educ_group") {
df[[new_var]] <- with(df, ifelse(df[[age_var]] <= 6, 5,
ifelse(df[[educ_var]] <= 38 & df[[age_var]] >= 7, 1,
ifelse(df[[educ_var]] == 39 & df[[age_var]] >= 7, 2,
ifelse(df[[educ_var]] %in% 40:42 & df[[age_var]] >= 7, 3,
ifelse(df[[educ_var]] >= 43 & df[[age_var]] >= 7, 4, -1))))))
return(df)
}
merged_ntia <- categorize_education(merged_ntia, "peeduca","peage")
valid_vars_by_year_net <- list(
"2001" = c("hesint1"),
"2003" = c("hesint1"),
"2007" = c("henet1"),
"2009" = c("henet1"),
"2011" = c("hesci6", "hesci5"),
"2013" = c("henet3","henet2","henet3a","pedesk","pelapt","petabl",
"pecell","pegame","petvba","pehome", "pecafe", "peelhs", "pecomm",
"pelibr", "pewrka", "peschl","peperscr","hesci1","hesci11"),
"2015" = c("heinhome","heinschl","heincafe","heintrav","heinlico","heinelho", "heinwork",
"helaptop", "hedesktp", "hetablet", "hemphone", "hewearab", "hetvbox"),
"2017" = c("heinhome","heinwork", "heinschl", "heincafe", "heintrav", "heinlico",
"heinelho", "heinothr","helaptop", "hedesktp", "hetablet", "hemphone", "hewearab", "hetvbox"),
"2019" = c("heinhome","heinwork", "heinschl", "heincafe", "heintrav", "heinlico", "heinelho",
"heinothr","helaptop", "hedesktp", "hetablet", "hemphone", "hewearab", "hetvbox",
"peemail", "petextim", "pesocial", "peconfer", "pevideo", "peaudio", "petelewk"),
"2021" = c("hedesktp","helaptop","hetablet","hemphone","hewearab","hetvbox",
"heinhome", "heinwork", "heinschl", "heincafe", "heinlico", "heintrav", "heinelho", "heinothr"),
"2023" = c("heinhome","heinwork", "heinschl", "heincafe", "heintrav", "heinlico", "heinelho",
"heinothr","henetchk","helaptop", "hedesktp", "hetablet", "hemphone", "hewearab", "hetvbox")
)
valid_vars_by_year_device <- list(
"2001" = c("hesc1"),
"2003" = c("hesc1"),
"2011" = c("hesci3"),
"2013" = c("henet3","henet2","peprim1","pedesk","pelapt","petabl","pecell","pegame","petvba",
"pehome", "pecafe", "peelhs", "pecomm","pelibr", "pewrka", "peschl","hesci1"),
"2015" = c("helaptop", "hedesktp", "hetablet", "hemphone", "hewearab", "hetvbox"),
"2017" = c("helaptop", "hedesktp", "hetablet", "hemphone", "hewearab", "hetvbox"),
"2019" = c("helaptop", "hedesktp", "hetablet", "hemphone", "hewearab", "hetvbox"),
"2021" = c("hedesktp","helaptop","hetablet","hemphone","hewearab","hetvbox"),
"2023" = c("helaptop","hedesktp","hetablet","hemphone","hewearab","hetvbox")
)
#--------------- Function for getting technology use by education qualifications
summarize_by_educ_group <- function(valid_vars_by_year) {
result <- bind_rows(
lapply(names(valid_vars_by_year), function(y) {
vars <- valid_vars_by_year[[y]]
vars <- vars[vars %in% colnames(merged_ntia)] # Keep only existing vars
if (length(vars) == 0) return(NULL)
merged_ntia %>%
filter(hryear4 == as.integer(y), educ_group %in% c(1, 2, 3, 4, 5)) %>%
mutate(
has_one = rowSums(select(., all_of(vars)) == 1 & select(., all_of(vars)) != -1, na.rm = TRUE) > 0
) %>%
group_by(hryear4, educ_group) %>%
summarise(
tech_users = sum(has_one),
total_people = n(),
.groups = "drop"
)
})
)
return(result)
}
#--------------- Function for getting technology use by particular age categories
summarize_by_age_group <- function(valid_vars_by_year) {
result <- bind_rows(
lapply(names(valid_vars_by_year), function(y) {
vars <- valid_vars_by_year[[y]]
vars <- vars[vars %in% colnames(merged_ntia)] # Keep only existing vars
if (length(vars) == 0) return(NULL)
merged_ntia %>%
filter(hryear4 == as.integer(y), prtage %in% c(4, 5,6,7)) %>%
mutate(
has_one = rowSums(select(., all_of(vars)) == 1 & select(., all_of(vars)) != -1, na.rm = TRUE) > 0
) %>%
group_by(hryear4, prtage) %>%
summarise(
tech_users = sum(has_one),
total_people = n(),
.groups = "drop"
)
})
)
return(result)
}
#--------------- Function for getting technology use by particular gender categories
summarize_by_gender_group <- function(valid_vars_by_year) {
result <- bind_rows(
lapply(names(valid_vars_by_year), function(y) {
vars <- valid_vars_by_year[[y]]
vars <- vars[vars %in% colnames(merged_ntia)] # Keep only existing vars
if (length(vars) == 0) return(NULL)
merged_ntia %>%
filter(hryear4 == as.integer(y), pesex %in% c(1, 2)) %>%
mutate(
has_one = rowSums(select(., all_of(vars)) == 1 & select(., all_of(vars)) != -1, na.rm = TRUE) > 0
) %>%
group_by(hryear4, pesex) %>%
summarise(
tech_users = sum(has_one),
total_people = n(),
.groups = "drop"
)
})
)
return(result)
}
#--------------- Function for getting technology use by particular income categories
summarize_by_income_group <- function(valid_vars_by_year) {
result <- bind_rows(
lapply(names(valid_vars_by_year), function(y) {
vars <- valid_vars_by_year[[y]]
vars <- vars[vars %in% colnames(merged_ntia)] # Keep only existing vars
if (length(vars) == 0) return(NULL)
merged_ntia %>%
filter(hryear4 == as.integer(y), hefaminc %in% c(1, 2, 3, 4)) %>%
mutate(
has_one = rowSums(select(., all_of(vars)) == 1 & select(., all_of(vars)) != -1, na.rm = TRUE) > 0
) %>%
group_by(hryear4, hefaminc) %>%
summarise(
tech_users = sum(has_one),
total_people = n(),
.groups = "drop"
)
})
)
return(result)
}
#--------------- Function for getting technology use by total population
summarize_tech_use_total <- function(varlist_by_year) {
result <- bind_rows(
lapply(names(varlist_by_year), function(y) {
vars <- varlist_by_year[[y]]
vars <- vars[vars %in% colnames(merged_ntia)] # Only keep existing variables
if (length(vars) == 0) return(NULL) # Skip if no valid columns
merged_ntia %>%
filter(hryear4 == as.integer(y)) %>%
mutate(
has_one = rowSums(select(., all_of(vars)) == 1 & select(., all_of(vars)) != -1, na.rm = TRUE) > 0
) %>%
summarise(hryear4 = first(hryear4), tech_users = sum(has_one))
})
)
return(result)
}
#======================== Calculating Households with Internet ===========================
result_users_net <- summarize_tech_use_total(valid_vars_by_year_net)
result_users_net_plots <- ggplot(result_users_net, aes(x = hryear4, y = tech_users)) +
geom_line()+
geom_point(size = 2) +
labs(
title = "Internet Usage Over The Years",
x = "Year",
y = "Number of Users"
) +
theme_light()
#=========================== Calculating Internet access according to Income categories======================
result_income_net <- summarize_by_income_group(valid_vars_by_year_net)
result_income_net_plots <- ggplot(result_income_net, aes(x = hryear4, y = tech_users, color = factor(hefaminc), group = hefaminc)) +
geom_line(size = 1) +
geom_point(size = 2) +
labs(
title = "Internet Usage By Income Bracket Over The Years",
x = "Year",
y = "Number of Users",
color = "Income"
) +
theme_light()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
#=========================== Calculating Internet access according to Gender categories======================
result_gender_net <- summarize_by_gender_group(valid_vars_by_year_net)
result_gender_net_plots <- ggplot(result_gender_net, aes(x = hryear4, y = tech_users, color = factor(pesex), group = pesex)) +
geom_line(size = 1) +
geom_point(size = 2) +
labs(
title = "Internet Usage By Gender Bracket Over The Years",
x = "Year",
y = "Number of Users",
color = "Gender"
) +
theme_light()
#=========================== Calculating Internet access according to Age categories======================
result_age_net <- summarize_by_age_group(valid_vars_by_year_net)
result_age_net_plots <- ggplot(result_age_net, aes(x = hryear4, y = tech_users, color = factor(prtage), group = prtage)) +
geom_line(size = 1) +
geom_point(size = 2) +
labs(
title = "Internet Usage By Age Bracket Over The Years",
x = "Year",
y = "Number of Users",
color = "Age"
) +
theme_light()
#=========================== Calculating Internet access according to Education categories======================
result_education_net <- summarize_by_educ_group(valid_vars_by_year_net)
result_education_net_plots <- ggplot(result_education_net, aes(x = hryear4, y = tech_users, color = factor(educ_group), group = educ_group)) +
geom_line()+
geom_point(size = 2) +
labs(
title = "Internet Usage By Education Qualification Bracket Over The Years",
x = "Year",
y = "Number of Users",
color = "Education"
) +
theme_light()
#======================= Calculating Device access to every household ===========================
result_users_dev <- summarize_tech_use_total(valid_vars_by_year_device)
result_users_dev_plots <- ggplot(result_users_dev, aes(x = hryear4, y = tech_users)) +
geom_line(size = 1.2) +
geom_point(size = 2) +
scale_x_continuous(
breaks = seq(min(result_users_dev$hryear4), max(result_users_dev$hryear4), by = 2)
) +
scale_y_continuous(
labels = label_number() # No scientific notation, no commas
) +
labs(
title = "Technology Usage Over The Years",
x = "Year",
y = "User Count"
) +
theme_light(base_size = 14)
#=========================== Calculating technology access according to Income categories======================
result_income_dev <- summarize_by_income_group(valid_vars_by_year_device)
result_income_dev_plots <- ggplot(result_income_dev, aes(x = hryear4, y = tech_users, color = factor(hefaminc), group = hefaminc)) +
geom_line(size = 1) +
geom_point(size = 2) +
labs(
title = "Technology Usage By Income Bracket Over The Years",
x = "Year",
y = "Number of Users",
color = "Income"
) +
theme_light()
#=========================== Calculating technology access according to Gender categories======================
result_gender_dev <- summarize_by_gender_group(valid_vars_by_year_device)
result_gender_dev_plots <- ggplot(result_gender_dev, aes(x = hryear4, y = tech_users, color = factor(pesex), group = pesex)) +
geom_line(size = 1) +
geom_point(size = 2) +
labs(
title = "Technology Usage By Gender Bracket Over The Years",
x = "Year",
y = "Number of Users",
color = "Gender"
) +
theme_light()
#=========================== Calculating technology access according to Age categories======================
result_age_dev <- summarize_by_age_group(valid_vars_by_year_device)
result_age_dev_plots <- ggplot(result_age_dev, aes(x = hryear4, y = tech_users, color = factor(prtage), group = prtage)) +
geom_line(size = 1) +
geom_point(size = 2) +
labs(
title = "Technology Usage By Age Bracket Over The Years",
x = "Year",
y = "Number of Users",
color = "Age"
) +
theme_light()
#=========================== Calculating technology access according to Education Qualifications categories======================
result_education_dev <- summarize_by_educ_group(valid_vars_by_year_device)
result_education_dev_plots <- ggplot(result_education_dev, aes(x = hryear4, y = tech_users, color = factor(educ_group), group = educ_group)) +
geom_line(size = 1) +
geom_point(size = 2) +
labs(
title = "Technology Usage By Education Qualification Bracket Over The Years",
x = "Year",
y = "Number of Users",
color = "Education"
) +
theme_light()
# Interactive plot
plot_income <- result_income_net %>%
mutate(percent = 100 * tech_users / total_people)
p_income <- ggplot(plot_income, aes(x = hryear4, y = percent, color = factor(hefaminc))) +
geom_line(size = 1.2) +
geom_point(size = 2) +
labs(title = "Tech Use by Income Group",
x = "Year", y = "Percent", color = "Income Group") +
theme_minimal()
ggplotly(p_income)
# Gender
p_gender <- ggplot(result_gender_net %>%
mutate(percent = 100 * tech_users / total_people),
aes(x = hryear4, y = percent, color = factor(pesex))) +
geom_line(size = 1.2) + geom_point(size = 2) +
labs(title = "Tech Use by Gender", x = "Year", y = "Percent", color = "Gender") +
theme_minimal()
# Age
p_age <- ggplot(result_age_net %>%
mutate(percent = 100 * tech_users / total_people),
aes(x = hryear4, y = percent, color = factor(prtage))) +
geom_line(size = 1.2) + geom_point(size = 2) +
labs(title = "Tech Use by Age Group", x = "Year", y = "Percent", color = "Age Group") +
theme_minimal()
# Education
p_education <- ggplot(result_education_net %>%
mutate(percent = 100 * tech_users / total_people),
aes(x = hryear4, y = percent, color = factor(educ_group))) +
geom_line(size = 1.2) + geom_point(size = 2) +
labs(title = "Tech Use by Education", x = "Year", y = "Percent", color = "Education Group") +
theme_minimal()
ggplotly(p_income)
ggplotly(p_gender)
ggplotly(p_age)
ggplotly(p_education)
# Interactive plot
plot_income <- result_income_dev %>%
mutate(percent = 100 * tech_users / total_people)
p_income <- ggplot(plot_income, aes(x = hryear4, y = percent, color = factor(hefaminc))) +
geom_line(size = 1.2) +
geom_point(size = 2) +
labs(title = "Tech Use by Income Group",
x = "Year", y = "Percent", color = "Income Group") +
theme_minimal()
ggplotly(p_income)
# Gender
p_gender <- ggplot(result_gender_dev %>%
mutate(percent = 100 * tech_users / total_people),
aes(x = hryear4, y = percent, color = factor(pesex))) +
geom_line(size = 1.2) + geom_point(size = 2) +
labs(title = "Tech Use by Gender", x = "Year", y = "Percent", color = "Gender") +
theme_minimal()
# Age
p_age <- ggplot(result_age_dev %>%
mutate(percent = 100 * tech_users / total_people),
aes(x = hryear4, y = percent, color = factor(prtage))) +
geom_line(size = 1.2) + geom_point(size = 2) +
labs(title = "Tech Use by Age Group", x = "Year", y = "Percent", color = "Age Group") +
theme_minimal()
# Education
p_education <- ggplot(result_education_dev %>%
mutate(percent = 100 * tech_users / total_people),
aes(x = hryear4, y = percent, color = factor(educ_group))) +
geom_line(size = 1.2) + geom_point(size = 2) +
labs(title = "Tech Use by Education", x = "Year", y = "Percent", color = "Education Group") +
theme_minimal()
ggplotly(p_income)
ggplotly(p_gender)
ggplotly(p_age)
ggplotly(p_education)
#======================== merging to interact with teammates data=====================
income_net <- result_income_net %>%
mutate(
category = "Income",
subgroup = hefaminc,
percent = 100 * tech_users / total_people,
tech_inc_u = tech_users,
tot_tech_inc_u = total_people
) %>%
select(hryear4, category, subgroup,percent, tech_inc_u,tot_tech_inc_u)
gender_net <- result_gender_net %>%
mutate(
category = "Gender",
subgroup = pesex,
percent = 100 * tech_users / total_people,
tech_gen_u = tech_users,
tot_tech_gen_u = total_people
) %>%
select(hryear4, category, subgroup,percent, tech_gen_u,tot_tech_gen_u)
age_net <- result_age_net %>%
mutate(
category = "Age",
subgroup = prtage,
percent = 100 * tech_users / total_people,
tech_age_u = tech_users,
tot_tech_age_u = total_people
) %>%
select(hryear4, category, subgroup,percent, tech_age_u, tot_tech_age_u)
education_net <- result_education_net %>%
mutate(
category = "Education",
subgroup = educ_group,
percent = 100 * tech_users / total_people,
tech_edu_u = tech_users,
tot_tech_edu_u = total_people
) %>%
select(hryear4, category, subgroup,percent, tech_edu_u, tot_tech_edu_u)
# Combine them safely
combined_results_net <- bind_rows(income_net, gender_net, age_net, education_net)
wide_combined_results_net <- combined_results_net %>%
pivot_wider(
names_from = category,
values_from = percent
)
income_dev <- result_income_dev %>%
mutate(
category = "Income",
subgroup = hefaminc,
dev_inc_u = tech_users,
tot_dev_inc_u = total_people,
percent = 100 * tech_users / total_people
) %>%
select(hryear4, category, subgroup, percent,dev_inc_u, tot_dev_inc_u)
gender_dev <- result_gender_dev %>%
mutate(
category = "Gender",
subgroup = pesex,
dev_gen_u = tech_users,
tot_dev_gen_u = total_people,
percent = 100 * tech_users / total_people
) %>%
select(hryear4, category, subgroup, percent,dev_gen_u,tot_dev_gen_u)
age_dev <- result_age_dev %>%
mutate(
category = "Age",
subgroup = prtage,
dev_age_u = tech_users,
tot_dev_age_u = total_people,
percent = 100 * tech_users / total_people
) %>%
select(hryear4, category, subgroup, percent,dev_age_u, tot_dev_age_u)
education_dev <- result_education_dev %>%
mutate(
category = "Education",
subgroup = educ_group,
dev_edu_u = tech_users,
tot_dev_edu_u = total_people,
percent = 100 * tech_users / total_people
) %>%
select(hryear4, category, subgroup, percent,,dev_edu_u, tot_dev_edu_u)
# Combine them safely
combined_results_dev <- bind_rows(income_dev, gender_dev, age_dev, education_dev)
wide_combined_results_dev <- combined_results_dev %>%
pivot_wider(
names_from = category,
values_from = percent
)
# Replace -1 with NA for relevant variables
merged_ntia1 <- merged_ntia %>%
mutate(
pesex = ifelse(pesex == -1, NA, pesex),
hufaminc = ifelse(hufaminc == -1, NA, hufaminc),
prtage = ifelse(prtage == "-1", NA, prtage),
educ_group = ifelse(educ_group == "-1", NA, educ_group)
)
merged_ntia1$has_net <- NA # initialize
for (year in names(valid_vars_by_year_net)) {
year_int <- as.integer(year)
vars <- valid_vars_by_year_net[[year]]
vars <- vars[vars %in% colnames(merged_ntia1)] # only use columns that exist
# If there are valid columns to check
if (length(vars) > 0) {
year_rows <- merged_ntia1$hryear4 == year_int
# Subset the rows for that year
subset_df <- merged_ntia1[year_rows, vars, drop = FALSE]
# Calculate has_device: 1 if any variable == 1 (and not -1), else 0
merged_ntia1$has_net[year_rows] <- rowSums(subset_df == 1 & subset_df != -1, na.rm = TRUE) > 0
}
}
# Convert TRUE/FALSE to 1/0
merged_ntia1$has_net <- as.integer(merged_ntia1$has_net)
merged_ntia1$has_device <- NA
for (year in names(valid_vars_by_year_device)) {
year_int <- as.integer(year)
vars <- valid_vars_by_year_device[[year]]
vars <- vars[vars %in% colnames(merged_ntia1)] # only use columns that exist
# If there are valid columns to check
if (length(vars) > 0) {
year_rows <- merged_ntia1$hryear4 == year_int
# Subset the rows for that year
subset_df <- merged_ntia1[year_rows, vars, drop = FALSE]
# Calculate has_device: 1 if any variable == 1 (and not -1), else 0
merged_ntia1$has_device[year_rows] <- rowSums(subset_df == 1 & subset_df != -1, na.rm = TRUE) > 0
}
}
# Convert TRUE/FALSE to 1/0
merged_ntia1$has_device <- as.integer(merged_ntia1$has_device)
educated_levels <- c(1,2,3,4,5)
tech_use_summary <- merged_ntia1 %>%
filter(
educ_group %in% educated_levels
) %>%
mutate(
gender = case_when(
pesex == 1 ~ "Male",
pesex == 2 ~ "Female",
TRUE ~ NA_character_
)
) %>%
group_by(hryear4, prtage, hufaminc, pesex) %>%
summarise(
avg_tech_use = mean(has_device, na.rm = TRUE) * 100,
n = n(),
.groups = "drop"
)
net_use_summary <- merged_ntia1 %>%
filter(
educ_group %in% educated_levels
) %>%
mutate(
gender = case_when(
pesex == 1 ~ "Male",
pesex == 2 ~ "Female",
TRUE ~ NA_character_
)
) %>%
group_by(hryear4, prtage, hufaminc, pesex) %>%
summarise(
avg_net_use = mean(has_net, na.rm = TRUE) * 100,
n = n(),
.groups = "drop"
)
#========================================Saving plots===========================================
plot_list <- list(result_age_net_plots, result_users_net_plots, result_education_net_plots, result_gender_net_plots,
result_income_net_plots, result_age_dev_plots, result_education_dev_plots, result_gender_dev_plots,
result_income_dev_plots,result_users_dev_plots )
plot_names <- c(
"age_net", "users_net", "education_net", "gender_net", "income_net",
"age_dev", "education_dev", "gender_dev", "income_dev", "users_dev"
)
dir.create("plots", showWarnings = FALSE)
for (i in seq_along(plot_list)) {
ggsave(
filename = paste0("plots/", plot_names[i], ".png"),
plot = plot_list[[i]],
width = 8, height = 6, dpi = 300
)
}
#====================================Saving Result data====================================
result_names <- c(
"result_users_net","result_education_net", "result_age_net","result_gender_net","result_income_net",
"result_users_dev", "result_age_dev", "result_education_dev","result_gender_dev","result_income_dev",
"wide_combined_results_dev","wide_combined_results_net", "net_use_summary", "tech_use_summary","merged_ntia")
# Create a folder if it doesn't exist
dir.create("results", showWarnings = FALSE)
# Loop and save each one to a CSV
for (name in result_names) {
df <- get(name)
write.csv(df, file = paste0("results/", name, ".csv"), row.names = FALSE)
}